These include what customers searched for, how they interacted with search results (click/book), and whether or not the search result was a travel package. Expedia is interested in predicting which hotel group a user is going to book. Expedia has in-house algorithms to form hotel clusters, where similar hotels for a search (based on historical price, customer star ratings, geographical locations relative to city center, etc.) are grouped together.
Column name | Description | Data type |
---|---|---|
date_time | Timestamp | string |
site_name | ID of the Expedia point of sale (i.e. Expedia.com, Expedia.co.uk, Expedia.co.jp, ...) | int |
posa_continent | ID of continent associated with site_name | int |
user_location_country | The ID of the country the customer is located in | int |
user_location_region | The ID of the region the customer is located in | int |
user_location_city | The ID of the city the customer is located in | int |
orig_destination_distance | Physical distance between a hotel and a customer at the time of search. A null means the distance could not be calculated | double |
user_id | ID of user | int |
is_mobile | 1 when a user connected from a mobile device, 0 otherwise | tinyint |
is_package | 1 if the click/booking was generated as a part of a package (i.e. combined with a flight), 0 otherwise | int |
channel | ID of a marketing channel | int |
srch_ci | Checkin date | string |
srch_co | Checkout date | string |
srch_adults_cnt | The number of adults specified in the hotel room | int |
srch_children_cnt | The number of (extra occupancy) children specified in the hotel room | int |
srch_rm_cnt | The number of hotel rooms specified in the search | int |
srch_destination_id | ID of the destination where the hotel search was performed | int |
srch_destination_type_id | Type of destination | int |
hotel_continent | Hotel continent | int |
hotel_country | Hotel country | int |
hotel_market | Hotel market | int |
is_booking | 1 if a booking, 0 if a click | tinyint |
cnt | Number of similar events in the context of the same user session | bigint |
hotel_cluster | ID of a hotel cluster | int |
Column name | Description | Data type |
---|---|---|
srch_destination_id | ID of the destination where the hotel search was performed | int |
d1-d149 | latent description of search regions | double |
In [ ]:
# Switch the session to English / UTF-8: LANG through the environment and the
# LC_TIME locale category explicitly. The weekday factors built further down
# are defined with English abbreviations as their levels.
Sys.setenv(LANG = "en_US.UTF-8")
Sys.setlocale(category = "LC_TIME", locale = "en_US.UTF-8")
library(ggplot2) # Data visualization
library(data.table) # Faster data reading
library(dplyr) # Data aggregation etc.
library(scales) # Plot scaling
library(gridExtra) # Arrange plots
library(corrplot) # Correlations
In [ ]:
# Load the bzip2-compressed training sample; the string is a shell command
# (`bzcat`) whose output fread parses as comma-separated text with a header row.
train <- fread(
  "bzcat ../../data/expedia/train500k.csv.bz2",
  header = TRUE,
  sep = ","
)
In [ ]:
# Convert to factors: every column except the timestamp and the distance is
# treated as categorical. This includes srch_ci / srch_co, which are turned
# into Date values again below (as.Date() accepts factors).
cols <- colnames(train)
for (col in setdiff(cols, c("date_time", "orig_destination_distance"))) {
  train[[col]] <- as.factor(train[[col]])
}

# Convert dates.
# The timestamp is parsed with an explicit time zone. as.Date() on a POSIXct
# value extracts the calendar day in UTC by default, so parsing in the
# session's local zone could move `date` to the neighbouring day (and make it
# disagree with the hour derived from `date_time`) on machines that are not
# running in UTC. Using UTC for both steps keeps the recorded wall-clock
# date and time unchanged.
train$date_time <- as.POSIXct(train$date_time, tz = "UTC")
train$date <- as.Date(train$date_time, tz = "UTC")
train$srch_ci <- as.Date(train$srch_ci)
train$srch_co <- as.Date(train$srch_co)
In [ ]:
# Print a compact overview of the table: one line per column with its type
# and first values.
str(train)
In [ ]:
# Number of rows per calendar day and is_booking value, drawn as one line per
# is_booking level.
train.agg <- summarise(group_by(train, date, is_booking), count = n())
ggplot(data = train.agg) +
  aes(x = date, y = count, colour = is_booking) +
  geom_line(size = 0.2) +
  theme(legend.position = "top")
In [ ]:
# Level counts of the three 0/1 flags, shown side by side in one row.
isBook <- ggplot(data = train) + aes(x = is_booking) + geom_bar()
isMobl <- ggplot(data = train) + aes(x = is_mobile) + geom_bar()
isPckg <- ggplot(data = train) + aes(x = is_package) + geom_bar()
grid.arrange(isBook, isMobl, isPckg, ncol = 3, nrow = 1)
In [ ]:
# Level counts for marketing channel, destination type, room count and the
# `cnt` column, stacked vertically (one plot per row).
chnl <- ggplot(data = train) + aes(x = channel) + geom_bar()
sdti <- ggplot(data = train) + aes(x = srch_destination_type_id) + geom_bar()
room <- ggplot(data = train) + aes(x = srch_rm_cnt) + geom_bar()
cnt <- ggplot(data = train) + aes(x = cnt) + geom_bar()
grid.arrange(chnl, sdti, room, cnt, ncol = 1, nrow = 4)
In [ ]:
# Level counts of the adults and children fields, one plot above the other.
srch_adlt <- ggplot(data = train) +
  aes(x = srch_adults_cnt) +
  geom_bar(fill = "blue")
srch_chld <- ggplot(data = train) +
  aes(x = srch_children_cnt) +
  geom_bar(fill = "blue")
grid.arrange(srch_adlt, srch_chld, ncol = 1, nrow = 2)
In [ ]:
# Level counts of the point-of-sale continent (top) and the hotel continent
# (bottom).
posa_cont <- ggplot(data = train) +
  aes(x = posa_continent) +
  geom_bar(fill = "blue")
hotl_cont <- ggplot(data = train) +
  aes(x = hotel_continent) +
  geom_bar(fill = "blue")
grid.arrange(posa_cont, hotl_cont, ncol = 1, nrow = 2)
In [ ]:
# Row count per hotel cluster; bars are coloured per cluster and the legend is
# hidden.
ggplot(data = train) +
  aes(x = hotel_cluster, fill = hotel_cluster) +
  geom_bar() +
  theme(legend.position = "none")
In [ ]:
# Scatter of the event date against the check-in date for a random sample of
# booking rows.
bookings <- train[train$is_booking == 1]

# Draw at most 10000 points. Capping at the number of available bookings
# avoids the "cannot take a sample larger than the population" error that
# sample() raises when the input holds fewer than 10000 bookings.
n_shown <- min(10000L, nrow(bookings))
ggplot(bookings[sample(nrow(bookings), n_shown)], aes(x = date, y = srch_ci)) +
  geom_point(color = "blue", alpha = 0.1, size = 0.4)
In [ ]:
# Trip duration in days (check-out minus check-in); negative durations are
# treated as invalid and set to NA.
train$tripDur <- as.numeric(train$srch_co - train$srch_ci)
train$tripDur[train$tripDur < 0] <- NA

# Histograms of the duration for searches (is_booking == 0) and bookings
# (is_booking == 1).
# geom_histogram() is used because geom_bar() no longer accepts `binwidth`.
# Bins are one day wide and centred on whole days. The view is restricted with
# coord_cartesian() rather than xlim(): xlim() sets scale limits, which removes
# the bars whose edges extend past 0 or 30 instead of just zooming.
srch_tripDur <- ggplot(train[train$is_booking == 0], aes(x = tripDur)) +
  geom_histogram(binwidth = 1, center = 0) +
  coord_cartesian(xlim = c(0, 30)) +
  ggtitle("Searches") +
  theme(legend.position = "none")
book_tripDur <- ggplot(train[train$is_booking == 1], aes(x = tripDur)) +
  geom_histogram(binwidth = 1, center = 0) +
  coord_cartesian(xlim = c(0, 30)) +
  ggtitle("Bookings") +
  theme(legend.position = "none")
grid.arrange(srch_tripDur, book_tripDur, nrow = 2, ncol = 1)
In [ ]:
# Lead time in days between the event date and the check-in date; negative
# values are treated as invalid and set to NA.
train$bookAhead <- as.numeric(train$srch_ci - train$date)
train$bookAhead[train$bookAhead < 0] <- NA

# Weekly histograms of the lead time for searches (is_booking == 0) and
# bookings (is_booking == 1).
# geom_histogram() is used because geom_bar() no longer accepts `binwidth`.
# Bins start at 0 (boundary = 0), so the first bar covers days 0-7. The view is
# restricted with coord_cartesian() rather than xlim(): xlim() sets scale
# limits, which removes the bars that extend past the limits - here that
# would include the first, shortest-lead-time bar.
srch_bookAhead <- ggplot(train[train$is_booking == 0], aes(x = bookAhead)) +
  geom_histogram(binwidth = 7, boundary = 0) +
  coord_cartesian(xlim = c(0, 200)) +
  ggtitle("Searches") +
  theme(legend.position = "none")
book_bookAhead <- ggplot(train[train$is_booking == 1], aes(x = bookAhead)) +
  geom_histogram(binwidth = 7, boundary = 0) +
  coord_cartesian(xlim = c(0, 200)) +
  ggtitle("Bookings") +
  theme(legend.position = "none")
grid.arrange(srch_bookAhead, book_bookAhead, nrow = 2, ncol = 1)
In [ ]:
#train$week <- cut(train$date, "weeks")
# Calendar features taken from the event timestamp.
train$Year <- as.numeric(format(train$date_time, "%Y"))
train$Month <- as.numeric(format(train$date_time, "%m"))
train$Hour <- as.numeric(format(train$date_time, "%H"))

# Add weekdays (with ordered levels, Monday first).
# The labels are looked up from the numeric day of the week instead of being
# taken from weekdays(): weekdays() returns names in the current LC_TIME
# locale, so if the English locale is unavailable on a machine every value
# would fail to match the levels below and silently become NA.
wd <- c("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun")
weekday_factor <- function(dates) {
  # POSIXlt$wday is 0 (Sunday) to 6 (Saturday); shift so that Monday maps to 1.
  iso_day <- (as.POSIXlt(dates)$wday + 6L) %% 7L + 1L
  factor(wd[iso_day], levels = wd)
}
train$weekday <- weekday_factor(train$date)
train$weekday_ci <- weekday_factor(train$srch_ci)
train$weekday_co <- weekday_factor(train$srch_co)
In [ ]:
# Weekday distributions of the event date, the check-in date and the check-out
# date (columns), for is_booking == 0 (top row, titled "Searches") and
# is_booking == 1 (bottom row, titled "Bookings"). Each row uses a fixed
# y-axis range so its three panels are directly comparable.
wd_searches <- train[train$is_booking == 0]
wd_bookings <- train[train$is_booking == 1]
hide_legend <- theme(legend.position = "none")

srch_wd <- ggplot(wd_searches) +
  aes(x = weekday, fill = weekday) +
  geom_bar() +
  ylim(0, 100000) +
  ggtitle("Searches") +
  hide_legend
srch_wd_ci <- ggplot(wd_searches) +
  aes(x = weekday_ci, fill = weekday_ci) +
  geom_bar() +
  ylim(0, 100000) +
  ggtitle("Searches") +
  hide_legend
srch_wd_co <- ggplot(wd_searches) +
  aes(x = weekday_co, fill = weekday_co) +
  geom_bar() +
  ylim(0, 100000) +
  ggtitle("Searches") +
  hide_legend

book_wd <- ggplot(wd_bookings) +
  aes(x = weekday, fill = weekday) +
  geom_bar() +
  ylim(0, 8200) +
  ggtitle("Bookings") +
  hide_legend
book_wd_ci <- ggplot(wd_bookings) +
  aes(x = weekday_ci, fill = weekday_ci) +
  geom_bar() +
  ylim(0, 8200) +
  ggtitle("Bookings") +
  hide_legend
book_wd_co <- ggplot(wd_bookings) +
  aes(x = weekday_co, fill = weekday_co) +
  geom_bar() +
  ylim(0, 8200) +
  ggtitle("Bookings") +
  hide_legend

grid.arrange(
  srch_wd, srch_wd_ci, srch_wd_co,
  book_wd, book_wd_ci, book_wd_co,
  ncol = 3, nrow = 2
)
In [ ]:
# Flag, for booking rows only, whether the stay (check-in through check-out,
# both inclusive) contains a Saturday or a Sunday. Non-booking rows stay NA.
#
# Computed arithmetically instead of enumerating every day of every stay:
# with the check-in weekday numbered 1 (Mon) to 7 (Sun) and a stay of d days,
# the stay covers weekday numbers w, w + 1, ..., w + d, so it reaches Saturday
# (6) or later exactly when w + d >= 6. Stays longer than six days satisfy this
# automatically. This is vectorised, does not depend on the locale's weekday
# names, and yields NA (rather than an error from seq()) when a date is
# missing or the check-out precedes the check-in.
is_book <- train$is_booking == 1
ci <- train$srch_ci[is_book]
co <- train$srch_co[is_book]

stay_days <- as.numeric(co - ci)
stay_days[stay_days < 0] <- NA

# POSIXlt$wday is 0 (Sunday) to 6 (Saturday); shift so that Monday maps to 1.
ci_wday <- (as.POSIXlt(ci)$wday + 6L) %% 7L + 1L

with_weekend <- rep(NA, nrow(train))
with_weekend[is_book] <- ci_wday + stay_days >= 6
train$withWeekend <- with_weekend
summary(train$withWeekend)
In [ ]:
# Lead time against trip duration for rows that are both bookings and part of
# a package; points are jittered and semi-transparent to reduce overplotting.
pkg_bookings <- train[train$is_booking == 1 & train$is_package == 1]
ggplot(data = pkg_bookings) +
  aes(x = bookAhead, y = tripDur) +
  geom_point(position = "jitter", color = "blue", alpha = 0.2, size = 0.5) +
  ggtitle("Package Bookings")
In [ ]:
# Columns entering the correlation plot: the raw search attributes plus the
# engineered date features, with the target (hotel_cluster) last.
cols <- c(
  "site_name", "posa_continent", "user_location_country",
  "user_location_region", "user_location_city", "is_mobile", "is_package",
  "channel", "srch_adults_cnt", "srch_children_cnt", "srch_rm_cnt",
  "srch_destination_id", "srch_destination_type_id", "is_booking", "cnt",
  "hotel_continent", "hotel_country", "hotel_market", "Year", "Month", "Hour",
  "weekday", "tripDur", "bookAhead", "orig_destination_distance",
  "hotel_cluster"
)
df <- data.table(train[, cols, with = FALSE])

# Impute missing values: -1 marks an unknown duration / lead time, and an
# unknown distance is replaced by the mean of the known distances.
# Plain `column[rows] <- value` replacement is used; it is also well defined
# when a column has no missing values at all.
df$tripDur[is.na(df$tripDur)] <- -1
df$bookAhead[is.na(df$bookAhead)] <- -1
dist_mean <- mean(df$orig_destination_distance, na.rm = TRUE)
df$orig_destination_distance[is.na(df$orig_destination_distance)] <- dist_mean

# Everything is reduced to integers for cor(). Factor columns become their
# integer level codes and the distance is truncated to whole units.
# NOTE(review): level codes only stand in for the underlying values in a
# rank-based (Spearman) correlation if the factor levels are in numeric order -
# confirm before switching `method`.
df[] <- lapply(df, as.integer)
corrplot(cor(df, method = "spearman"), order = "AOE")
In [ ]:
# Save the table, including the columns added above, as a gzip-compressed CSV.
write.csv(
  x = train,
  file = gzfile("../../data/expedia/train_feat.csv.gz")
)